TidyTuesday week 40: NBER papers, data from the National Bureau of Economic Research NBER by way of the nberwp package by Ben Davies.
library(tidyverse)
library(glue)
library(gt)
library(gtExtras)
papers <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-09-28/papers.csv')
Rows: 29434 Columns: 4
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): paper, title
dbl (2): year, month
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
authors <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-09-28/authors.csv')
Rows: 15437 Columns: 4
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (4): author, name, user_nber, user_repec
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
programs <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-09-28/programs.csv')
Rows: 21 Columns: 3
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): program, program_desc, program_category
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
paper_authors <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-09-28/paper_authors.csv')
Rows: 67090 Columns: 2
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): paper, author
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
paper_programs <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-09-28/paper_programs.csv')
Rows: 53996 Columns: 2
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): paper, program
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
joined_df <- left_join(papers, paper_authors) %>%
left_join(authors) %>%
left_join(paper_programs) %>%
left_join(programs)%>%
mutate(
catalogue_group = str_sub(paper, 1, 1),
catalogue_group = case_when(
catalogue_group == "h" ~ "Historical",
catalogue_group == "t" ~ "Technical",
catalogue_group == "w" ~ "General"
),
.after = paper
)
Joining, by = "paper"
Joining, by = "author"
Joining, by = "paper"
Joining, by = "program"
wp = joined_df %>% group_by(program, program_desc, year) %>%
summarise(n=n_distinct(paper)) %>%
arrange(year) %>%
mutate(decade=case_when(between(year,1980,1989)~"1980s",
between(year,1990,1999)~"1990s",
between(year,2000,2009)~"2000s",
between(year,2010,2019)~"2010s"
)) %>%
drop_na()
`summarise()` has grouped output by 'program', 'program_desc'. You can override using the `.groups` argument.
wp2 = wp %>% mutate(program=glue::glue("{program_desc} ({program})")) %>%
group_by(program) %>% mutate(total=sum(n)) %>%
arrange(year, program) %>%
mutate(spark=list(n)) %>%
select(program, spark, total) %>%
distinct()
wp3 = wp %>% mutate(program=glue::glue("{program_desc} ({program})")) %>%
group_by(program, decade) %>% tally(n) %>%
ungroup() %>%
pivot_wider(names_from = decade, values_from=n) %>%
mutate_if(is.numeric, list(~replace_na(., 0)))
wp3 %>% inner_join(wp2, by="program") %>%
select(Program=program, Total=total, "1980s","1990s","2000s","2010s",Trend=spark) %>%
arrange(desc(Total)) %>%
gt() %>%
gt_theme_espn() %>%
cols_align(Program, align="left") %>%
gt_plt_dot(Total, Program,palette = "rcartocolor::ag_GrnYl", max_value=5246) %>%
gtExtras::gt_sparkline(Trend) %>%
tab_options(table.font.size = 12.5,
heading.subtitle.font.size = 14) %>%
gt_color_box(`1980s`, domain=2:786) %>%
gt_color_box(`1990s`, domain=2:797) %>%
gt_color_box(`2000s`, domain=132:1647) %>%
gt_color_box(`2010s`, domain=200:2424) %>%
tab_header(title="NBER Papers", subtitle="Working papers count by program and decade") %>%
tab_source_note(source_note="TidyTuesday Week 40 | Data source: National Bureau of Economic Research (NBER) by way of the nberwp package by Ben Davies")
| NBER Papers | ||||||
|---|---|---|---|---|---|---|
| Working papers count by program and decade | ||||||
| Program | Total | 1980s | 1990s | 2000s | 2010s | Trend |
Labor Studies (LS)
|
5246 | 489.0
|
797.0
|
1536.0
|
2424.0
|
|
Public Economics (PE)
|
5216 | 506.0
|
764.0
|
1561.0
|
2385.0
|
|
Economic Fluctuations and Growth (EFG)
|
5200 | 562.0
|
680.0
|
1647.0
|
2311.0
|
|
International Finance and Macroeconomics (IFM)
|
3820 | 786.0
|
699.0
|
1145.0
|
1190.0
|
|
International Trade and Investment (ITI)
|
3297 | 782.0
|
725.0
|
911.0
|
879.0
|
|
Monetary Economics (ME)
|
3020 | 509.0
|
504.0
|
782.0
|
1225.0
|
|
Asset Pricing (AP)
|
2426 | 0.0
|
307.0
|
927.0
|
1192.0
|
|
Productivity, Innovation, and Entrepreneurship (PR)
|
2182 | 111.0
|
290.0
|
636.0
|
1145.0
|
|
Health Economics (HE)
|
2039 | 87.0
|
143.0
|
620.0
|
1189.0
|
|
Corporate Finance (CF)
|
2034 | 2.0
|
175.0
|
718.0
|
1139.0
|
|
Development of the American Economy (DAE)
|
1532 | 59.0
|
230.0
|
540.0
|
703.0
|
|
Industrial Organization (IO)
|
1530 | 0.0
|
106.0
|
489.0
|
935.0
|
|
Children (CH)
|
1510 | 2.0
|
60.0
|
565.0
|
883.0
|
|
Economics of Aging (AG)
|
1490 | 43.0
|
221.0
|
495.0
|
731.0
|
|
Health Care (HC)
|
1464 | 0.0
|
153.0
|
479.0
|
832.0
|
|
Economics of Education (ED)
|
1417 | 0.0
|
2.0
|
436.0
|
979.0
|
|
Political Economics (POL)
|
1120 | 0.0
|
0.0
|
260.0
|
860.0
|
|
Environment and Energy Economics (EEE)
|
1096 | 2.0
|
13.0
|
256.0
|
825.0
|
|
Law and Economics (LE)
|
1020 | 20.0
|
72.0
|
353.0
|
575.0
|
|
Development Economics (DEV)
|
999 | 0.0
|
0.0
|
0.0
|
999.0
|
|
Technical Working Papers (TWP)
|
335 | 0.0
|
3.0
|
132.0
|
200.0
|
|
| TidyTuesday Week 40 | Data source: National Bureau of Economic Research (NBER) by way of the nberwp package by Ben Davies | ||||||
ALT text: The table showing the count of NBER papers, by program and decade, from 1980 to 2019, where Labor studies program have the highest total working paper (n=5246) in the time period and Technical Working papers have the lowest total (n=335). The table includes a spark line showing the yearly counts of working paper, where all programs have higher counts of working papers in recent years, except for International Trade and Investment program.
papers_joined <-
paper_programs %>%
left_join(programs) %>%
left_join(papers) %>%
filter(!is.na(program_category)) %>%
distinct(paper, program_category, year, title)
Joining, by = "program"
Joining, by = "paper"
library(Hmisc)
Hmisc::describe(papers_joined$program_category)
papers_joined$program_category
n missing distinct
34875 0 3
Value Finance Macro/International Micro
Frequency 4336 12012 18527
Proportion 0.124 0.344 0.531
library(ggthemes)
theme_set(theme_gdocs(base_size = 9))
theme_update(plot.margin = unit(c(1.1,.7,1,.7), "cm"),
plot.background = element_rect(color=NA),
strip.text.x=element_text(size = 10, color="black", margin=margin(b=10)))
library(tidytext)
library(tidylo)
title_log_odds <-
papers_joined %>%
unnest_tokens(word, title) %>%
filter(!is.na(program_category)) %>%
count(program_category, word, sort = TRUE) %>%
bind_log_odds(program_category, word, n)
title_log_odds %>%
group_by(program_category) %>%
slice_max(log_odds_weighted, n = 10) %>%
ungroup() %>%
ggplot(aes(log_odds_weighted,
fct_reorder(word, log_odds_weighted),
fill = program_category
)) +
geom_col(show.legend = FALSE, width=0.5, alpha=.8) +
facet_wrap(vars(program_category), scales = "free_y") +
labs(x = "Log odds (weighted)", y = NULL) +
theme(panel.grid.major.y=element_blank()) +
scale_fill_fivethirtyeight()
library(tidymodels)
library(themis)
library(textrecipes)
set.seed(123)
nber_split <- initial_split(papers_joined, strata = program_category)
nber_train <- training(nber_split)
nber_test <- testing(nber_split)
set.seed(234)
nber_folds <- vfold_cv(nber_train, strata = program_category)
nber_folds
# 10-fold cross-validation using stratification